import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff
We get the data from OWID and interpolate/upsample the missing values weekly.
d=pd.read_excel('https://covid.ourworldindata.org/data/owid-covid-data.xlsx')
max_date=max(d['date'])
print(max_date)
2022-05-28
If there is no single row with both Vaccinations and Excess Mortality data we have nothing to work with. So we filter such countries out completely.
country_corr=d.groupby('iso_code')[
['excess_mortality','total_vaccinations_per_hundred']
].corr().unstack().iloc[:,1].reset_index()
country_corr.columns=['iso_code','correlation']
countries_worth_looking_at=list(
country_corr[~country_corr['correlation'].isna()]
.sort_values(
'correlation'
,ascending=False
)['iso_code']
)
country_corr_dict=dict(zip(country_corr.iso_code, country_corr.iso_code + country_corr.correlation.apply( " {0:.2f}".format) ))
dfc = d[(d['iso_code'].isin(countries_worth_looking_at))].copy()
Some countries have the data reported monthly while others have missing data points. We resample everything to the weekly grane and interpolate via backfilling. This way the vaccination time series before the start of reporting are backfilled with a static value (first value in time serie), mostly close to zero. This makes the year 2020 kind of a synthetic control in regard to the vaccination treatment.
# resample needs datetime index
dfc['datetime'] = pd.to_datetime(dfc['date'])
dfc.index = dfc['datetime']
del dfc['datetime']
# we use the mean() function to fill the missing values with the NAs
dfi=dfc.groupby('iso_code').resample('W').mean().reset_index()
dfi.index = pd.to_datetime(dfi['datetime'])
del dfi['datetime']
# interpolation within countries
for i in range(len(dfi.iso_code.unique())):
mask = dfi.loc[:,'iso_code']==dfi.iso_code.unique()[i]
dfi[mask]=dfi[mask].interpolate(method='bfill')
dfi.reset_index(inplace=True)
print('Correlation total_vaccinations_per_hundred and excess_mortality ='
,'\x1b[1;30;30m{:.2f}\x1b[m'.format(
dfi[['excess_mortality', 'total_vaccinations_per_hundred']].corr().unstack()[1]))
Correlation total_vaccinations_per_hundred and excess_mortality = -0.05
At the moment of writing the correlation is close to zero (-0.04). Let's de-average the overal value by calculating the correlation per country.
We chart correlation between excess_mortality and total_vaccinations_per_hundred (and other vaccination metrics) per country to make sure the data is sane.
The Choropleth Charts give an idea how the correlation is distibuted over the globe. May be there is some role the seasonality plays in the resulting in-country corellation.
The Histograms show that the in-country correlation is kinda evenly distributed over the whole [-1:1] interval and there are no visible ouliers.
for metric in ['total_vaccinations_per_hundred'
, 'people_vaccinated_per_hundred'
, 'people_fully_vaccinated_per_hundred'
, 'total_boosters_per_hundred'
, 'new_cases_per_million' ]:
country_corr=dfc.groupby('iso_code')[
['excess_mortality', metric]
].corr().unstack().iloc[:,1].reset_index()
colname = 'correlation excess_mortality and ' + metric
country_corr.columns=['iso_code', colname]
fig = px.choropleth (
country_corr,
locationmode = 'ISO-3',
locations = 'iso_code',
color = colname,
)
fig.update_traces(
showlegend=False
, selector=dict(type='choropleth'))
fig.update_layout(
width=2048,
height=800,
title_text=colname,
geo=dict( showcoastlines=False,),
coloraxis_colorbar=dict( title='Scale, Correlation',),
)
fig.show('png')
fig=ff.create_distplot(
[country_corr.dropna()[colname]]
, [colname]
, bin_size=.2
, histnorm = 'probability'
)
fig.update_layout(
title_text='Country distribution over the correlation [-1:1] and kernel density estimation'
, xaxis_title="Countries over Correlation per Country"
, yaxis_title="Probabililty and Kernel Density Estimation"
, legend_x=0
)
fig.show()
# let's add a helper column for facet titles
dfi['cntr']=dfi['iso_code'].replace(country_corr_dict, inplace=False)
fig = px.line(dfi,
x='datetime',
y=['total_vaccinations_per_hundred','excess_mortality'],
facet_col='cntr',
category_orders={'iso_code':sorted(countries_worth_looking_at)},
facet_col_wrap=4,
facet_row_spacing=0.01,
height=6000, #width=800, # change to fit your output media
labels={
"value": "per hundred / percent",
"variable": "",
},
title="Country Charts and Corr, Time Series Backfilled Weekly, Jan 2020 - "+str(max_date),
)
fig.update_layout(
legend=dict(
yanchor="top",
y=0.999,
xanchor="left",
x=0.01
),
xaxis_title="Weekly Data Jan 2020 - Jan 2022"
)
fig.update_yaxes(range=[-50, 250])
fig.show() # we can't use 'png' here in JupyterLab as the charts get scrambled, a plotly bug?
For each vaccine we split all countries into two groups as per vaccine presence and calculate the correlation within those two group. Obviously there are countries with multiple vaccine present. And some vaccine may only come together in all countries. Yet the results are interesting and non-trivial.
v=pd.read_csv('https://github.com/owid/covid-19-data/raw/master/public/data/vaccinations/locations.csv', sep=',', quotechar='"'
, header=0, usecols=['iso_code', 'vaccines'], dtype={'iso_code':'str','vaccines':'str'} )
from sklearn.preprocessing import MultiLabelBinarizer
# convert to contain lists instead of plain text
# TODO: get rid of leading spaces or whatever blanks they are instead of dropping all spaces.
v['vaccines']=v['vaccines'].str.replace(r'[^a-zA-Z,]+',r'').str.split(',')
# break into indicator variables
mlb = MultiLabelBinarizer()
indicators = pd.DataFrame(mlb.fit_transform(v['vaccines']),
columns=mlb.classes_, index=v.index)
vac_list=list(mlb.classes_)
vac_list
['Abdala', 'COVIranBarekat', 'CanSino', 'Covaxin', 'EpiVacCorona', 'FAKHRAVAC', 'IMBCAMS', 'JohnsonJohnson', 'KCONVAC', 'KoviVacChumakov', 'Medigen', 'Moderna', 'Novavax', 'OxfordAstraZeneca', 'PfizerBioNTech', 'QazVac', 'RaziCovPars', 'SinopharmBeijing', 'SinopharmWuhan', 'Sinovac', 'Soberana', 'SoberanaPlus', 'SpikoGen', 'SputnikLight', 'SputnikV', 'Turkovac', 'ZF']
v = pd.concat([v.reset_index(drop=True),
indicators.reset_index(drop=True)], axis=1)
v.set_index('iso_code', drop=True, inplace=True)
dfi=dfi.join(v, on='iso_code')
def get_vac_corr(vac):
vac_corr=dfi.groupby(vac)['excess_mortality','total_vaccinations_per_hundred'].corr()
vac_corr1 = vac_corr.stack().reset_index()
vac_corr1.columns = ['vaccine_presence_ind','b','c','correlation']
vac_corr1.vaccine_presence_ind.replace([0,1],['absent','present'], inplace=True)
vac_corr1['vaccine_name']=vac
corr_per_vac = vac_corr1.loc[
vac_corr1.b.isin(['excess_mortality']) & vac_corr1.c.isin(['total_vaccinations_per_hundred'])
, ['vaccine_name','vaccine_presence_ind','correlation']
]
return corr_per_vac
vac_corrs = pd.concat( get_vac_corr(vac) for vac in vac_list).reset_index()
del vac_corrs['index']
print(vac_corrs)
vaccine_name vaccine_presence_ind correlation 0 Abdala absent -0.048003 1 COVIranBarekat absent -0.044921 2 COVIranBarekat present -0.223644 3 CanSino absent -0.043545 4 CanSino present -0.057186 5 Covaxin absent -0.045747 6 Covaxin present -0.001514 7 EpiVacCorona absent -0.049296 8 EpiVacCorona present 0.420166 9 FAKHRAVAC absent -0.044921 10 FAKHRAVAC present -0.223644 11 IMBCAMS absent -0.048003 12 JohnsonJohnson absent -0.026476 13 JohnsonJohnson present -0.076803 14 KCONVAC absent -0.048003 15 KoviVacChumakov absent -0.048003 16 Medigen absent -0.049068 17 Medigen present 0.160218 18 Moderna absent 0.010573 19 Moderna present -0.057900 20 Novavax absent -0.053577 21 Novavax present 0.036196 22 OxfordAstraZeneca absent 0.013858 23 OxfordAstraZeneca present -0.059773 24 PfizerBioNTech absent 0.060407 25 PfizerBioNTech present -0.052338 26 QazVac absent -0.049272 27 QazVac present 0.230302 28 RaziCovPars absent -0.044921 29 RaziCovPars present -0.223644 30 SinopharmBeijing absent -0.043461 31 SinopharmBeijing present 0.002624 32 SinopharmWuhan absent -0.042465 33 SinopharmWuhan present 0.076682 34 Sinovac absent -0.038962 35 Sinovac present -0.023576 36 Soberana absent -0.044921 37 Soberana present -0.223644 38 SoberanaPlus absent -0.048003 39 SpikoGen absent -0.044921 40 SpikoGen present -0.223644 41 SputnikLight absent -0.040890 42 SputnikLight present -0.068327 43 SputnikV absent -0.012948 44 SputnikV present -0.031509 45 Turkovac absent -0.048003 46 ZF absent -0.047824 47 ZF present -0.132376
fig = px.bar(vac_corrs
,x='vaccine_name'
,y='correlation'
,color='vaccine_presence_ind'
,color_discrete_sequence=["grey","blue",]
,barmode="group"
,title="""Correlation between excess_mortality and total_vaccinations_per_hundred
<br>in countries grouped by vaccine presence/absence."""
, height=600, width=1000, # change to fit your output media
).update_xaxes(categoryorder ='max descending').show()
Copyright 2021 Abbrivia GmbH https://www.abbrivia.com CC-BY (By Attribution) 4.0 https://creativecommons.org/licenses/by/4.0/legalcode Reuse our work freely!
All visualizations, and code produced in this notebook are completely open access under the Creative Commons BY license. You have the permission to use, distribute, and reproduce these in any medium, provided the source and authors are credited.
The data produced by third parties and made available by "Our World in Data" is subject to the license terms from the original third-party authors. Check the license of any third-party data before use and redistribution on 'https://ourworldindata.org/coronavirus' site (see below).
See the defintions and further discussion on the used dataset at the "Our World in Data" site https://ourworldindata.org/covid-vaccinations
The data is taken specifically from https://covid.ourworldindata.org/data/owid-covid-data.xlsx file
Hannah Ritchie, Edouard Mathieu, Lucas Rodés-Guirao, Cameron Appel, Charlie Giattino, Esteban Ortiz-Ospina, Joe Hasell, Bobbie Macdonald, Diana Beltekian and Max Roser (2020) - "Coronavirus Pandemic (COVID-19)". Published online at OurWorldInData.org. Retrieved from: 'https://ourworldindata.org/coronavirus' [Online Resource]
We use Excel file because it contains the data format information in itself. If you want to run this more often consider manually downloading the data and sourcing it locally as shown in the next line (commented out).